from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, ArrayType, StringType, StructField, IntegerType

# Driver: build (or reuse) a local SparkSession for this demo application.
spark = (
    SparkSession.builder
    .master('local')
    .appName('HelloSpark')
    .getOrCreate()
)
# 1. Programmatic schema definition (StructType/StructField API).
# NOTE(review): added the leading `Id` column so this schema matches the
# DDL string variant defined below — the original was missing it, so the
# two supposedly-equivalent schemas disagreed.
schema = StructType([
    StructField("Id", IntegerType(), False),
    StructField("First", StringType(), False),
    StructField("Last", StringType(), False),
    StructField("Url", StringType(), False),
    StructField("Published", StringType(), False),
    StructField("Hits", IntegerType(), False),
    StructField("Campaigns", ArrayType(StringType()), False),
])

# 2. DDL-string schema definition — this is the variant actually applied
# to the reader below (the StructType above is defined but unused here).
ddl = "`Id` Int, `First` String, `Last` String, `Url` String, `Published` String, `Hits` INT, `Campaigns` ARRAY<String>"

# Read the JSON-lines file with the DDL schema and show the inferred structure.
df = spark.read.schema(ddl).json("dataset/blogs.txt")

df.printSchema()


